In [50]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
In [51]:
import config
from config import BasePaths
from CellLineWork.CellLineExpressionMatrix import CellLineExpressionMatrix
In [52]:
import scipy
from scipy.cluster.hierarchy import linkage, dendrogram
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
import math
import pandas as pd
import pickle

This performs the following:

  1. Normalize read counts per cell (divide by sum of reads in cell)
  2. Change to TPM values (x10^6)
  3. Filter cells (>4000 non zero genes)
  4. Filter genes (log2(mean(TPM)+1)>=3
  5. Transform values to log TMP by log2((TPM/10)+1)
  6. Center values per gene
In [53]:
exp_obj = CellLineExpressionMatrix().execute()
Logging into /Users/iriskalka/Dropbox (Weizmann Institute)/CacheCopy/PipelineCache/CellLineExpressionMatrix2018_04_22_08_55.log
Creating new instance of expression matrix
Running pipeline step <class 'PreprocessingPipeline.Transformers.UploadMatExpressionMatrix.UploadMatExpressionMatrix'>
Running pipeline step <class 'PreprocessingPipeline.Transformers.NormalizeExpressionByCell.NormalizeExpressionByCell'>
Running pipeline step <class 'PreprocessingPipeline.Transformers.CellFilteringByReadsThreshold.CellFilteringByReadsThreshold'>
Running pipeline step <class 'PreprocessingPipeline.Transformers.FilterGenesByPopulationExpression.FilterGenesByPopulationExpression'>
Running pipeline step <class 'PreprocessingPipeline.Transformers.TransformToTPMAndCenter.TransformToTPMAndCenter'>
Running pipeline step <class 'PreprocessingPipeline.Transformers.ClusterExpressionIntoTree.ClusterExpressionIntoTree'>
<Figure size 432x288 with 0 Axes>

Cluster all cells (just to see how it is done)

CHANGED TO AVERAGE METHOD HERE, CHECK THIS!!

In [54]:
def create_clustering(expression_mat, show=False, p=100):
    # Create correlation based distance matrix
    corr_dist_matrix = scipy.spatial.distance.cdist(expression_mat.transpose(), 
                                                    expression_mat.transpose(), 'correlation')
    corr_dist_condensed = scipy.spatial.distance.pdist(expression_mat.transpose(), metric='correlation')
    
    print(corr_dist_matrix.shape, len(corr_dist_condensed), 
          expression_mat.shape, expression_mat.shape[1]*(expression_mat.shape[1]-1)/2)
    # Create tree
    corr_tree = linkage(corr_dist_condensed, method='average')

    # Create dendrogram
    corr_dn = dendrogram(corr_tree)
    
    if show:
        fig, ax = plt.subplots()
        ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
        ax.set_xlabel('sample index')
        ax.set_ylabel('distance')
        dendrogram(
            corr_tree,
            truncate_mode='lastp',  # show only the last p merged clusters
            p=p,  # show only the last p merged clusters
            show_leaf_counts=False,  # otherwise numbers in brackets are counts
            leaf_rotation=90.,
            leaf_font_size=12.,
            show_contracted=True,  # to get a distribution impression in truncated branches
            ax=ax
        )
        plt.show()
    return corr_dist_matrix, corr_tree, corr_dn
In [55]:
def show_correlation_heatmap(expression_matrix, dendogram, corr_matrix=None, vmax=None, vmin=None):
    plt.subplot()
    if corr_matrix is None:
        corr_matrix = np.corrcoef(expression_matrix.iloc[:, dendogram['leaves']].transpose())
        mask = corr_matrix==1
        if vmin is None:
            vmin = -max(np.absolute(np.percentile(corr_matrix[corr_matrix!=1], 95)), 
                        np.absolute(np.percentile(corr_matrix[corr_matrix!=1], 0.05)))
        if vmax is None:
            vmax = max(np.absolute(np.percentile(corr_matrix[corr_matrix!=1], 95)), 
                        np.absolute(np.percentile(corr_matrix[corr_matrix!=1], 0.05)))
        print('vmax={}, vmin={}'.format(vmax, vmin))
        sns.heatmap(corr_matrix, xticklabels=False, yticklabels=False, cmap='seismic', vmax=vmax, vmin=vmin)
        print(corr_matrix.shape)
    else:
        sns.heatmap(pd.DataFrame(corr_matrix).iloc[dendogram['leaves'], dendogram['leaves']], 
                xticklabels=False, yticklabels=False, cmap='seismic', vmin=-1, vmax=1)
    plt.show()
In [56]:
full_dist, full_tree, full_dn = create_clustering(exp_obj.expression_matrix, show=True)
show_correlation_heatmap(exp_obj.expression_matrix, full_dn)
(2635, 2635) 3470295 (9127, 2635) 3470295.0
vmax=0.18092029825799724, vmin=-0.18092029825799724
(2635, 2635)
In [57]:
show_correlation_heatmap(exp_obj.expression_matrix, full_dn, vmin=-0.3, vmax=0.3)
vmax=0.3, vmin=-0.3
(2635, 2635)
In [58]:
plt.figure(figsize=(10,10))
ax = plt.subplot()
dendrogram(
            full_tree,
            truncate_mode='lastp',  # show only the last p merged clusters
            p=50,  # show only the last p merged clusters
            show_leaf_counts=False,  # otherwise numbers in brackets are counts
            leaf_rotation=90.,
            leaf_font_size=12.,
            show_contracted=True,  # to get a distribution impression in truncated branches,
            ax=ax
        )
ax.set_ylim(bottom=0.74, top=0.84)
plt.show()

Separate to cell lines

In [59]:
cell_lines_path = BasePaths.CellLineExpression
columns_to_compare=['GE_CCLE_match', 'SNP_CL_match']
cell_id_column="sample_id"
In [60]:
def get_cohesive_cell_lines(csv_path, columns_to_compare, id_column):
    cell_lines = pd.read_csv(csv_path)
    cohesive_cells_lines = cell_lines.loc[
        cell_lines.apply(lambda x: len(np.unique(x[columns_to_compare])) == 1, axis=1)]
    return cohesive_cells_lines[[id_column, columns_to_compare[0]]].rename(
        columns={columns_to_compare[0]: "CellLine"})
In [61]:
cohesive_cell_lines = get_cohesive_cell_lines(cell_lines_path, columns_to_compare, cell_id_column)
cell_lines = set(cohesive_cell_lines['CellLine'].values)
cohesive_cell_lines.groupby('CellLine').count().sort_values(by='sample_id', ascending=False)
Out[61]:
sample_id
CellLine
TE14_OESOPHAGUS 225
8305C_THYROID 185
OVCAR4_OVARY 149
ONCODG1_OVARY 146
CAKI2_KIDNEY 138
NCIH2228_LUNG 130
SNU46_UPPER_AERODIGESTIVE_TRACT 125
KALS1_CENTRAL_NERVOUS_SYSTEM 115
SKMEL5_SKIN 103
U118MG_CENTRAL_NERVOUS_SYSTEM 102
UACC257_SKIN 100
ASPC1_PANCREAS 97
SCC25_UPPER_AERODIGESTIVE_TRACT 88
BICR16_UPPER_AERODIGESTIVE_TRACT 87
NCIH2444_LUNG 79
HUH28_BILIARY_TRACT 76
OAW28_OVARY 72
HEC6_ENDOMETRIUM 65
IM95_STOMACH 44
HCC366_LUNG 39
NCIH2073_LUNG 36
SNU1214_UPPER_AERODIGESTIVE_TRACT 30
TE10_OESOPHAGUS 26
SU8686_PANCREAS 17
In [62]:
def get_cohesive_expression_obj(exp_obj, cohesive_cell_lines):
    cohesive_cell_indices = list(map(lambda x: x in cohesive_cell_lines.sample_id.values, exp_obj.cells_matrix))
    cohesive_cell_indices = [c[0] for c in enumerate(cohesive_cell_indices) if c[1]]
    cohesive_exp_matrix = exp_obj.expression_matrix.loc[:, cohesive_cell_indices]
    print(cohesive_exp_matrix.shape, exp_obj.expression_matrix.shape)
    return cohesive_exp_matrix
In [63]:
def change_cells_index_to_barcodes(exp_mat, cell_name):
    exp_mat = exp_mat.transpose()
    exp_mat['barcode'] = list(map(lambda x: cell_name[x], exp_mat.index))
    exp_mat.set_index('barcode', inplace=True)
    return exp_mat
In [64]:
cohesive_exp_matrix = get_cohesive_expression_obj(exp_obj, cohesive_cell_lines)
cohesive_exp_matrix = change_cells_index_to_barcodes(cohesive_exp_matrix, exp_obj.cells_matrix)
cohesive_exp_matrix['CellLine'] = cohesive_cell_lines.set_index('sample_id').loc[cohesive_exp_matrix.index]
cohesive_exp_matrix.head()
(9127, 2274) (9127, 2635)
Out[64]:
8 27 33 35 40 42 43 46 53 54 ... 32702 32703 32704 32705 32706 32707 32708 32717 32721 CellLine
barcode
AAACCTGCAAACGTGG-1 1.150069 -0.612891 -0.517273 0.898029 -0.211171 1.664156 0.214821 -0.645481 -0.251104 1.257913 ... 0.222684 1.832808 1.957944 0.614129 1.232325 2.272645 0.719477 1.263660 -0.691672 SCC25_UPPER_AERODIGESTIVE_TRACT
AAACCTGGTAAATACG-1 -1.617814 -0.612891 -0.517273 1.010792 -1.672097 -1.074126 -1.750702 1.174049 0.316563 0.576614 ... -1.285644 -1.454299 -2.951609 -0.985259 -0.027207 0.612259 -0.686145 1.897293 -0.691672 U118MG_CENTRAL_NERVOUS_SYSTEM
AAACCTGGTTAAGTAG-1 -1.617814 1.087895 2.805106 -0.300016 0.123544 -1.522396 -0.049915 1.055306 0.500878 -2.022542 ... 0.149358 0.282890 0.003024 -0.174310 -0.544563 -0.789441 0.230010 -0.701863 -0.691672 TE14_OESOPHAGUS
AAACCTGTCAATCTCT-1 0.767870 -0.612891 -0.517273 -0.374173 0.831501 -1.969217 -1.750702 1.740204 0.414136 1.218053 ... -1.122744 -1.957061 -0.565924 -1.243988 -1.586413 -0.851942 -1.119314 -0.701863 -0.691672 U118MG_CENTRAL_NERVOUS_SYSTEM
AAACCTGTCTTATCTG-1 -1.617814 -0.612891 -0.517273 -0.099966 -0.831734 0.412197 1.790282 1.226335 0.568649 0.637350 ... 0.527144 0.357142 0.886298 0.602200 0.192766 0.676452 1.087158 1.169953 -0.691672 HUH28_BILIARY_TRACT

5 rows × 9128 columns

In [65]:
cell_lines_dict = {}
for cell_line in set(cohesive_exp_matrix.CellLine.values):
    exp_mat = cohesive_exp_matrix.loc[
        cohesive_exp_matrix.CellLine==cell_line,
        [col for col in cohesive_exp_matrix.columns if col!='CellLine']].transpose()
    if exp_mat.shape[1]>=50:
        cell_lines_dict[cell_line] = exp_mat

Cluster each cell line

In [66]:
for cell_line, exp_matrix in cell_lines_dict.items():
    print('working on {}'.format(cell_line))
    corr_dist, tree, dn = create_clustering(exp_matrix, show=True)
    show_correlation_heatmap(exp_matrix, dn, vmin=0.0)
working on TE14_OESOPHAGUS
(225, 225) 25200 (9127, 225) 25200.0
vmax=0.38730654372129103, vmin=0.0
(225, 225)
working on BICR16_UPPER_AERODIGESTIVE_TRACT
(87, 87) 3741 (9127, 87) 3741.0
vmax=0.37585450582614643, vmin=0.0
(87, 87)
working on CAKI2_KIDNEY
(138, 138) 9453 (9127, 138) 9453.0
vmax=0.3824699519058153, vmin=0.0
(138, 138)
working on 8305C_THYROID
(185, 185) 17020 (9127, 185) 17020.0
vmax=0.30705181175807705, vmin=0.0
(185, 185)
working on NCIH2444_LUNG
(79, 79) 3081 (9127, 79) 3081.0
vmax=0.3403977730021431, vmin=0.0
(79, 79)
working on SCC25_UPPER_AERODIGESTIVE_TRACT
(88, 88) 3828 (9127, 88) 3828.0
vmax=0.33345660365627217, vmin=0.0
(88, 88)
working on U118MG_CENTRAL_NERVOUS_SYSTEM
(102, 102) 5151 (9127, 102) 5151.0
vmax=0.4042459737396351, vmin=0.0
(102, 102)
working on ASPC1_PANCREAS
(97, 97) 4656 (9127, 97) 4656.0
vmax=0.3533771217984693, vmin=0.0
(97, 97)
working on SKMEL5_SKIN
(103, 103) 5253 (9127, 103) 5253.0
vmax=0.34345050794659016, vmin=0.0
(103, 103)
working on HEC6_ENDOMETRIUM
(65, 65) 2080 (9127, 65) 2080.0
vmax=0.35462728641944397, vmin=0.0
(65, 65)
working on NCIH2228_LUNG
(130, 130) 8385 (9127, 130) 8385.0
vmax=0.32094640858763995, vmin=0.0
(130, 130)
working on HUH28_BILIARY_TRACT
(76, 76) 2850 (9127, 76) 2850.0
vmax=0.4448768766635951, vmin=0.0
(76, 76)
working on ONCODG1_OVARY
(146, 146) 10585 (9127, 146) 10585.0
vmax=0.4618959263064763, vmin=0.0
(146, 146)
working on KALS1_CENTRAL_NERVOUS_SYSTEM
(115, 115) 6555 (9127, 115) 6555.0
vmax=0.28248388403283864, vmin=0.0
(115, 115)
working on OAW28_OVARY
(72, 72) 2556 (9127, 72) 2556.0
vmax=0.47502753607592807, vmin=0.0
(72, 72)
working on OVCAR4_OVARY
(149, 149) 11026 (9127, 149) 11026.0
vmax=0.3803426359201004, vmin=0.0
(149, 149)
working on UACC257_SKIN
(100, 100) 4950 (9127, 100) 4950.0
vmax=0.39259193684166654, vmin=0.0
(100, 100)
working on SNU46_UPPER_AERODIGESTIVE_TRACT
(125, 125) 7750 (9127, 125) 7750.0
vmax=0.3434643595295079, vmin=0.0
(125, 125)

Test new threshold for genes (consider 4)

In [67]:
class HasName: pass
in_path = HasName()
in_path.name = ''
for pipe_step in  exp_obj.composing_items[:2]:
    in_path.name = pipe_step.out_file_path(in_path)
print(in_path.name)
with open(in_path.name, 'rb') as in_file:
    original_exp_obj = pickle.load(in_file)
original_exp_obj
/Users/iriskalka/Dropbox (Weizmann Institute)/CacheCopy/PipelineCache/FromMatExpressionMatrix_NormalizeExpressionByCell
Out[67]:
<PreprocessingPipeline.ExpressionAndMetaData.ExpressionMetaDataBase.ExpressionMetaDataBase at 0x1163e9d68>
In [68]:
original_exp_obj.expression_matrix = change_cells_index_to_barcodes(
    original_exp_obj.expression_matrix, 
    original_exp_obj.cells_matrix).transpose()
original_exp_obj.expression_matrix.head()
Out[68]:
barcode AAACCTGAGTGGAGTC-1 AAACCTGCAAACGTGG-1 AAACCTGCATTGTGCA-1 AAACCTGGTAAATACG-1 AAACCTGGTTAAGTAG-1 AAACCTGTCAATCTCT-1 AAACCTGTCCAAACTG-1 AAACCTGTCCCTCAGT-1 AAACCTGTCTTATCTG-1 AAACCTGTCTTGCCGT-1 ... TTTGGTTAGATCCTGT-1 TTTGGTTAGCTAACTC-1 TTTGGTTCACTAAGTC-1 TTTGGTTTCTGCTGCT-1 TTTGGTTTCTGGTATG-1 TTTGTCACAAACCCAT-1 TTTGTCACAGATCGGA-1 TTTGTCATCGTCGTTC-1 TTTGTCATCGTTACGA-1 TTTGTCATCTTATCTG-1
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 22.715399 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 2736 columns

In [69]:
tmp_dict = {}
for cell_line, exp_matrix in cell_lines_dict.items():
    print("working on cell line {}".format(cell_line))
    tmp_original_exp = original_exp_obj.expression_matrix.loc[exp_matrix.index, exp_matrix.columns]
    gene_sum_values = tmp_original_exp.values
    gene_sum_values = gene_sum_values.mean(axis=1)
    gene_sum_values += 1
    gene_sum_values = np.log2(gene_sum_values)
    fig, ax = plt.subplots(2, 1)
    ax[0].plot(range(len(gene_sum_values)), np.sort(gene_sum_values))
    ax[1].hist(gene_sum_values, bins=30)
    plt.show()
    tmp_dict[cell_line] = exp_matrix.iloc[gene_sum_values>4, :]
cell_lines_dict = tmp_dict
working on cell line TE14_OESOPHAGUS
working on cell line BICR16_UPPER_AERODIGESTIVE_TRACT
working on cell line CAKI2_KIDNEY
working on cell line 8305C_THYROID
working on cell line NCIH2444_LUNG
working on cell line SCC25_UPPER_AERODIGESTIVE_TRACT
working on cell line U118MG_CENTRAL_NERVOUS_SYSTEM
working on cell line ASPC1_PANCREAS
working on cell line SKMEL5_SKIN
working on cell line HEC6_ENDOMETRIUM
working on cell line NCIH2228_LUNG
working on cell line HUH28_BILIARY_TRACT
working on cell line ONCODG1_OVARY
working on cell line KALS1_CENTRAL_NERVOUS_SYSTEM
working on cell line OAW28_OVARY
working on cell line OVCAR4_OVARY
working on cell line UACC257_SKIN
working on cell line SNU46_UPPER_AERODIGESTIVE_TRACT
In [70]:
cell_line_name = 'ONCODG1_OVARY'
single_cell_line_exp = cell_lines_dict[cell_line_name]
single_cell_line_exp.head()
Out[70]:
barcode AAACGGGCAGGACCCT-1 AAACGGGGTGCGCTTG-1 AAAGCAATCCTTGGTC-1 AAAGTAGTCGGCGCAT-1 AACTCCCAGGTGGGTT-1 AACTCTTGTCCGACGT-1 AAGACCTAGATGCCAG-1 ACACCCTGTTGGTTTG-1 ACATACGGTGACGGTA-1 ACATCAGGTTGATTCG-1 ... TGTCCCAAGAACTGTA-1 TGTCCCACATCATCCC-1 TGTCCCATCAACCATG-1 TTAACTCTCTTGTCAT-1 TTAGGACCACACATGT-1 TTAGGACCATTACCTT-1 TTAGGCAGTCCAAGTT-1 TTCGGTCCAGTATGCT-1 TTGGCAATCGTACGGC-1 TTGTAGGTCTATCCTA-1
35 0.966179 0.827275 1.444698 0.829589 0.029701 0.074532 -1.101448 0.518789 -1.013906 0.435348 ... 0.314207 0.473129 0.537795 1.947978 1.196200 0.045039 1.381143 0.601393 -0.392924 0.113426
40 -0.932603 -1.987593 -0.210870 2.148755 -0.702067 -0.057083 0.417037 1.148604 -0.110808 1.893650 ... 1.828948 -1.112819 0.980832 1.930923 1.482118 1.695409 -1.482298 -0.560711 3.981946 -0.969197
42 -1.784793 -2.142332 -0.429050 -0.221174 2.253533 -0.316412 0.390324 -1.932444 0.337614 -1.359494 ... -1.770711 -1.976564 -1.547123 2.061704 -0.497109 -0.638035 -0.585075 -1.483525 -0.267565 -0.915321
43 0.808322 1.310276 1.731083 1.636191 0.233739 -1.750702 1.150416 0.341563 0.762365 0.593999 ... 1.323363 1.117510 0.904624 0.840673 -0.992459 0.416993 1.067309 0.117343 1.104821 0.771727
46 1.139670 0.858552 1.320291 1.408857 -0.645481 0.509286 2.030721 -0.645481 1.867586 0.958484 ... 0.868190 1.733326 0.822648 -0.645481 0.112762 1.159826 0.683915 -0.645481 1.721453 0.195216

5 rows × 146 columns

In [71]:
# Change from log2((TPM/10)+1) to TPM
tpm_data = single_cell_line_exp.copy()
tpm_data = tpm_data.apply(lambda x: 10*((2**x)-1))
tpm_data.head()
Out[71]:
barcode AAACGGGCAGGACCCT-1 AAACGGGGTGCGCTTG-1 AAAGCAATCCTTGGTC-1 AAAGTAGTCGGCGCAT-1 AACTCCCAGGTGGGTT-1 AACTCTTGTCCGACGT-1 AAGACCTAGATGCCAG-1 ACACCCTGTTGGTTTG-1 ACATACGGTGACGGTA-1 ACATCAGGTTGATTCG-1 ... TGTCCCAAGAACTGTA-1 TGTCCCACATCATCCC-1 TGTCCCATCAACCATG-1 TTAACTCTCTTGTCAT-1 TTAGGACCACACATGT-1 TTAGGACCATTACCTT-1 TTAGGCAGTCCAAGTT-1 TTCGGTCCAGTATGCT-1 TTGGCAATCGTACGGC-1 TTGTAGGTCTATCCTA-1
35 9.536600 7.743314 17.220575 7.771784 0.208008 0.530194 -5.339516 4.327519 -5.047964 3.522374 ... 2.433278 3.881170 4.517516 28.583330 12.913530 0.317111 16.047458 5.171811 -2.384155 0.817943
40 -4.760876 -7.478407 -1.359840 34.344502 -3.853090 -0.387946 3.351827 12.169928 -0.739307 27.157418 ... 25.527804 -5.376105 9.736033 28.129897 17.935846 22.386874 -6.420816 -3.220320 148.010260 -4.892099
42 -7.097823 -7.734867 -2.572493 -1.421329 37.684908 -1.969354 3.106873 -7.380151 2.636653 -6.102809 ... -7.069358 -7.459057 -6.578083 31.747909 -2.914750 -3.574126 -3.333854 -6.423860 -1.692793 -4.697740
43 7.511732 14.798900 23.197686 21.084397 1.758787 -7.028428 12.197791 2.671287 6.962685 5.094254 ... 15.024876 11.697216 8.720565 7.908851 -4.973796 3.351420 10.955216 0.847356 11.507215 7.073121
46 12.033054 8.132181 14.971654 16.552677 -3.607202 4.233452 30.860897 -3.607202 26.492136 9.432672 ... 8.253718 23.249346 7.686497 -3.607202 0.812965 12.343053 6.064934 -3.607202 22.976844 1.448952

5 rows × 146 columns

In [72]:
exp_obj.composing_items
Out[72]:
[<PreprocessingPipeline.Transformers.UploadMatExpressionMatrix.UploadMatExpressionMatrix at 0x140d477f0>,
 <PreprocessingPipeline.Transformers.NormalizeExpressionByCell.NormalizeExpressionByCell at 0x116b2ecc0>,
 <PreprocessingPipeline.Transformers.CellFilteringByReadsThreshold.CellFilteringByReadsThreshold at 0x116b2e400>,
 <PreprocessingPipeline.Transformers.FilterGenesByPopulationExpression.FilterGenesByPopulationExpression at 0x116b2ee48>,
 <PreprocessingPipeline.Transformers.TransformToTPMAndCenter.TransformToTPMAndCenter at 0x116b2e748>,
 <PreprocessingPipeline.Transformers.ClusterExpressionIntoTree.ClusterExpressionIntoTree at 0x116b2e668>]
In [73]:
tmp_original_exp = original_exp_obj.expression_matrix.loc[single_cell_line_exp.index, single_cell_line_exp.columns]
gene_sum_values = tmp_original_exp.values
gene_sum_values = gene_sum_values.mean(axis=1)
gene_sum_values += 1
gene_sum_values = np.log2(gene_sum_values)
fig, ax = plt.subplots(2, 1)
ax[0].plot(range(len(gene_sum_values)), np.sort(gene_sum_values))
ax[1].hist(gene_sum_values, bins=30)
plt.show()

According to these results we will add create a second threshold of 3

Recenter genes (and then re-run and save clustering)

In [74]:
centered_cell_lines_dict = {}
for cell_line, cell_line_exp in cell_lines_dict.items():
    centered_cell_lines_dict[cell_line] = {}
    centered_cell_lines_dict[cell_line]['exp_mat'] = cell_line_exp.sub(cell_line_exp.mean(axis=1), axis=0)
In [75]:
for cell_line, exp_matrix in centered_cell_lines_dict.items():
    print('working on {}, of shape {}'.format(cell_line, exp_matrix['exp_mat'].shape))
    corr_dist, tree, dn = create_clustering(exp_matrix['exp_mat'], show=True)
    centered_cell_lines_dict[cell_line]['corr_dist'] = corr_dist
    centered_cell_lines_dict[cell_line]['tree'] = tree
    centered_cell_lines_dict[cell_line]['dn'] = dn
    show_correlation_heatmap(exp_matrix['exp_mat'], dn)
working on TE14_OESOPHAGUS, of shape (5327, 225)
(225, 225) 25200 (5327, 225) 25200.0
vmax=0.20887820314474934, vmin=-0.20887820314474934
(225, 225)
working on BICR16_UPPER_AERODIGESTIVE_TRACT, of shape (5581, 87)
(87, 87) 3741 (5581, 87) 3741.0
vmax=0.17349120254623268, vmin=-0.17349120254623268
(87, 87)
working on CAKI2_KIDNEY, of shape (5724, 138)
(138, 138) 9453 (5724, 138) 9453.0
vmax=0.1748980413731182, vmin=-0.1748980413731182
(138, 138)
working on 8305C_THYROID, of shape (5843, 185)
(185, 185) 17020 (5843, 185) 17020.0
vmax=0.2061412573083114, vmin=-0.2061412573083114
(185, 185)
working on NCIH2444_LUNG, of shape (6228, 79)
(79, 79) 3081 (6228, 79) 3081.0
vmax=0.20861554060237805, vmin=-0.20861554060237805
(79, 79)
working on SCC25_UPPER_AERODIGESTIVE_TRACT, of shape (5780, 88)
(88, 88) 3828 (5780, 88) 3828.0
vmax=0.1952111860672239, vmin=-0.1952111860672239
(88, 88)
working on U118MG_CENTRAL_NERVOUS_SYSTEM, of shape (6144, 102)
(102, 102) 5151 (6144, 102) 5151.0
vmax=0.2121178775070976, vmin=-0.2121178775070976
(102, 102)
working on ASPC1_PANCREAS, of shape (5432, 97)
(97, 97) 4656 (5432, 97) 4656.0
vmax=0.15657522809531704, vmin=-0.15657522809531704
(97, 97)
working on SKMEL5_SKIN, of shape (6061, 103)
(103, 103) 5253 (6061, 103) 5253.0
vmax=0.1767281241477743, vmin=-0.1767281241477743
(103, 103)
working on HEC6_ENDOMETRIUM, of shape (6443, 65)
(65, 65) 2080 (6443, 65) 2080.0
vmax=0.1360107081200351, vmin=-0.1360107081200351
(65, 65)
working on NCIH2228_LUNG, of shape (6019, 130)
(130, 130) 8385 (6019, 130) 8385.0
vmax=0.2149746795399129, vmin=-0.2149746795399129
(130, 130)
working on HUH28_BILIARY_TRACT, of shape (6317, 76)
(76, 76) 2850 (6317, 76) 2850.0
vmax=0.22657137399663158, vmin=-0.22657137399663158
(76, 76)
working on ONCODG1_OVARY, of shape (5112, 146)
(146, 146) 10585 (5112, 146) 10585.0
vmax=0.2416775320656548, vmin=-0.2416775320656548
(146, 146)
working on KALS1_CENTRAL_NERVOUS_SYSTEM, of shape (6113, 115)
(115, 115) 6555 (6113, 115) 6555.0
vmax=0.17430244474782539, vmin=-0.17430244474782539
(115, 115)
working on OAW28_OVARY, of shape (5928, 72)
(72, 72) 2556 (5928, 72) 2556.0
vmax=0.2599233118321043, vmin=-0.2599233118321043
(72, 72)
working on OVCAR4_OVARY, of shape (5937, 149)
(149, 149) 11026 (5937, 149) 11026.0
vmax=0.21310183069397215, vmin=-0.21310183069397215
(149, 149)
working on UACC257_SKIN, of shape (5901, 100)
(100, 100) 4950 (5901, 100) 4950.0
vmax=0.1553973347511733, vmin=-0.1553973347511733
(100, 100)
working on SNU46_UPPER_AERODIGESTIVE_TRACT, of shape (5728, 125)
(125, 125) 7750 (5728, 125) 7750.0
vmax=0.24977081264559245, vmin=-0.24977081264559245
(125, 125)

Cluster filtering

Computer differential expression for each cluster in each cell line

For each gene we compute the t_test between cluster and non_cluster cells. Meaningul clusters are of p-value lower than $10^{-4}$ or $10^{-5}$

In [76]:
def calculate_jaccard_sim_binary(a, b):
    return np.logical_and(a,b).sum() / float(np.logical_or(a, b).sum())
In [86]:
class Tree:
    def __init__(self, linkage_mat, exp_matrix, dend, bottom_threshold=5, top_threshold_percentage=80.0):
        self.linkage_matrix = linkage_mat
        self.exp_matrix = exp_matrix
        self.dn = dend
        self.num_original_elements = self.linkage_matrix.shape[0]+1
        self.children_dict = {}
        self.get_children_dict()
        self.bottom_threshold = bottom_threshold
        self.top_threshold_percentage = top_threshold_percentage
        self.top_threshold = int(math.ceil(self.top_threshold_percentage*self.num_original_elements/100.0))
        print("Using boundaries of {} and {}".format(self.bottom_threshold, self.top_threshold))
        self.filter_clusters_by_size()
    
    def get_children_dict(self):
        for i in range(self.num_original_elements):
            self.children_dict[i] = set([i])
        for i in range(self.linkage_matrix.shape[0]):
            child_1, child_2, _, num_elements = self.linkage_matrix[i]
            try:
                self.children_dict[i+self.num_original_elements] = self.children_dict[
                    int(child_1)].union(self.children_dict[int(child_2)])
                if not len(self.children_dict[i+self.num_original_elements])==num_elements:
                    print("wrong number of elements")
                    raise IOError
            except:
                print("failed at cluster #{}".format(i))
                print(i+1+self.num_original_elements)
                print(child_1)
    
    def filter_clusters_by_size(self):
        self.filtered_clusters = {}
        for clust_id, cluster in self.children_dict.items():
            if len(cluster)>=self.bottom_threshold and len(cluster)<=self.top_threshold:
                self.filtered_clusters[clust_id] = cluster
                
    def create_differential_expression_matrix(self, p_value_threshold=10**-4, fold_change_threshold=2):
        differential_expression_per_cluster = {}
        most_significant_gene_express = pd.DataFrame(
            index=self.exp_matrix.index)
        for clust_id, cluster in self.filtered_clusters.items():
        #     print(clust_id)
            in_clust = centered_cell_lines_dict[cell_line]['exp_mat'].iloc[:, list(cluster)]
            out_clust = centered_cell_lines_dict[cell_line]['exp_mat'].iloc[
                :, list(set(range(
                    centered_cell_lines_dict[cell_line]['exp_mat'].shape[1])) - cluster)]
            diff_exp_genes = pd.DataFrame(data = in_clust.index, columns=['gene_number'])
            diff_exp_genes['ttest_res'] = diff_exp_genes['gene_number'].apply(
                lambda x: scipy.stats.ttest_ind(in_clust.loc[x], out_clust.loc[x]))
            diff_exp_genes['t_statistic'] = diff_exp_genes.ttest_res.apply(lambda x: x[0])
            diff_exp_genes['p_value'] = diff_exp_genes.ttest_res.apply(lambda x: x[1])
            most_significant_gene_express[clust_id] = diff_exp_genes['p_value']
            diff_exp_genes['fold_change'] = diff_exp_genes['gene_number'].apply(
                lambda x: np.absolute(np.mean(in_clust.loc[x]) - np.mean(out_clust.loc[x])))
            diff_exp_genes['is_significant'] = diff_exp_genes.apply(
                lambda x: x['p_value']<=p_value_threshold and x['fold_change']>=fold_change_threshold, axis=1)
#             print("cluster {}, {} differentially expressed genes".format(
#                 clust_id,diff_exp_genes['is_significant'].sum()))
            differential_expression_per_cluster[clust_id] = diff_exp_genes
        self.most_significant_gene_express = most_significant_gene_express
        self.differential_expression_per_cluster = differential_expression_per_cluster
        self.most_significant_cluster_per_gene = self.most_significant_gene_express.apply(
            lambda x: x.idxmin(skipna=True), axis=1).dropna()
        
    def get_significant_clusters(self, min_significant_genes=30, min_most_significant_genes=5, **kwargs):
        if not hasattr(self, 'differential_expression_per_cluster'):
            self.create_differential_expression_matrix(**kwargs)
        significant_clusters = pd.DataFrame(index=['num_significant_genes', 'num_most_significant_genes'])
        for clust_id, cluster in self.filtered_clusters.items():
            try:
                num_significant_genes = self.differential_expression_per_cluster[clust_id]['is_significant'].sum()
            except:
                print('tmp.differential_expression_per_cluster {}, clust_id {}, res'.format(
                    self.differential_expression_per_cluster, clust_id, 
                    self.differential_expression_per_cluster[clust_id]['is_significant'].sum()))
                raise
            num_most_significant_genes = sum(self.most_significant_cluster_per_gene==clust_id)
            if num_significant_genes>=min_significant_genes or \
            num_most_significant_genes>=min_most_significant_genes:
                significant_clusters[clust_id] = [num_significant_genes, num_most_significant_genes]
        print(significant_clusters.shape)
        significant_clusters = significant_clusters.transpose()
        significant_clusters = significant_clusters.reset_index().rename(
            columns={'index':'cluster_id'}).set_index('cluster_id')
        significant_clusters.sort_values(
            by=['num_significant_genes', 'num_most_significant_genes'], 
            inplace=True, ascending=False)
        self.significant_clusters = significant_clusters
        
    def visualize_meaningful_clusters(self, **kwargs):
        if not hasattr(self, 'significant_clusters'):
            self.get_significant_clusters(**kwargs)
        cells_vs_clusters = pd.DataFrame(data = np.array(
            [False]*self.num_original_elements*self.significant_clusters.shape[0]).reshape(
            self.significant_clusters.shape[0], self.num_original_elements),
            index=self.significant_clusters.index, 
            columns=range(self.num_original_elements))
        for cluster_id in self.significant_clusters.index:
            cells_vs_clusters.loc[cluster_id].iloc[list(self.children_dict[cluster_id])] = True
        cells_vs_clusters = cells_vs_clusters.iloc[:, centered_cell_lines_dict[cell_line]['dn']['leaves']]
        cm = matplotlib.colors.LinearSegmentedColormap.from_list('BinaryColormap', ['black', 'yellow'], N=2)
        self.cm = cm
        fig, ax = plt.subplots(2, 1, figsize=(8, 16))

        corr_matrix = np.corrcoef(
            self.exp_matrix.iloc[
                :, self.dn['leaves']].transpose())
        mask = corr_matrix==1
        sns.heatmap(
            corr_matrix, 
            xticklabels=False, 
            yticklabels=False,
            cmap='seismic', 
            vmax=0.3, 
            vmin=-0.3, 
            ax=ax[0])
        print("significant shape matrix {}".format(cells_vs_clusters.shape))
        sns.heatmap(cells_vs_clusters, cmap=cm, ax=ax[1], linewidths=0.01)
        ax[1].set_xlabel('Cells')
        ax[1].get_xaxis().set_ticks(range(cells_vs_clusters.shape[1]))
        ax[1].get_yaxis().set_ticks(range(cells_vs_clusters.shape[0]))
        ax[1].tick_params(labelbottom=False, labelleft=False)
        self.cells_vs_clusters = cells_vs_clusters
        
    def compute_intersection_between_clusters(self, maximal_interaction=0.85):
        intersections_between_clusters = pd.DataFrame(
            index=self.cells_vs_clusters.index, 
            columns=self.cells_vs_clusters.index)
        for row in intersections_between_clusters.index.values:
            for col in intersections_between_clusters.columns.values:
                intersections_between_clusters.loc[row, col] = calculate_jaccard_sim_binary(
                    self.cells_vs_clusters.loc[row], self.cells_vs_clusters.loc[col])
        intersections_between_clusters -= np.identity(intersections_between_clusters.shape[0])
        clusters_to_keep = set(intersections_between_clusters.index)
        print(len(clusters_to_keep))
        while len(np.where(intersections_between_clusters>maximal_interaction)[0])>0:
            clust_1 = intersections_between_clusters.index[np.where(intersections_between_clusters>0.85)[0][0]]
            clust_2 = intersections_between_clusters.index[np.where(intersections_between_clusters>0.85)[1][0]]
            print(clust_1, clust_2)
            if self.significant_clusters.loc[
                clust_1].num_significant_genes > self.significant_clusters.loc[clust_2].num_significant_genes:
                intersections_between_clusters.drop(labels=[clust_2], axis=0, inplace=True)
                intersections_between_clusters.drop(labels=[clust_2], axis=1, inplace=True)
            elif self.significant_clusters.loc[
                clust_1].num_significant_genes < self.significant_clusters.loc[clust_2].num_significant_genes:
                intersections_between_clusters.drop(labels=[clust_1], axis=0, inplace=True)
                intersections_between_clusters.drop(labels=[clust_1], axis=1, inplace=True)
            else:
                if self.significant_clusters.loc[
                    clust_1].num_most_significant_genes > self.significant_clusters.loc[
                    clust_2].num_most_significant_genes:
                    intersections_between_clusters.drop(labels=[clust_2], axis=0, inplace=True)
                    intersections_between_clusters.drop(labels=[clust_2], axis=1, inplace=True)
                else:
                    print("clusters {} and {} are weirdly similar".format(clust_1, clust_2))
                    intersections_between_clusters.drop(labels=[clust_1], axis=0, inplace=True)
                    intersections_between_clusters.drop(labels=[clust_1], axis=1, inplace=True)
        self.intersections_between_clusters = intersections_between_clusters
        fig, ax = plt.subplots(2, 1, figsize=(8, 10))

        corr_matrix = np.corrcoef(
            self.exp_matrix.iloc[
                :, self.dn['leaves']].transpose())
        mask = corr_matrix==1
        sns.heatmap(
            corr_matrix, 
            xticklabels=False, 
            yticklabels=False,
            cmap='seismic', 
            vmax=0.5, 
            vmin=-0.5, 
            ax=ax[0])
        print('intersection shape matrix {}'.format(self.cells_vs_clusters.loc[self.intersections_between_clusters.index].shape))
        sns.heatmap(
            self.cells_vs_clusters.loc[self.intersections_between_clusters.index], 
            cmap=self.cm, ax=ax[1], linewidths=0.01)
        ax[1].set_xlabel('Cells')
        ax[1].get_xaxis().set_ticks(range(self.cells_vs_clusters.shape[1]))
        ax[1].get_yaxis().set_ticks(range(self.intersections_between_clusters.shape[0]))
        ax[1].tick_params(labelbottom=False, labelleft=False)

        plt.tight_layout()
        plt.show()
In [83]:
cell_line = 'KALS1_CENTRAL_NERVOUS_SYSTEM'
tmp = Tree(centered_cell_lines_dict[cell_line]['tree'], 
           centered_cell_lines_dict[cell_line]['exp_mat'],
           centered_cell_lines_dict[cell_line]['dn'])
tmp.visualize_meaningful_clusters()
tmp.compute_intersection_between_clusters()
Using boundaries of 5 and 92
(2, 42)
significant shape matrix (42, 115)
42
181 197
213 207
188 171
200 186
intersection shape matrix (38, 115)
In [87]:
trees_per_cell_line = {}
for cell_line, cell_line_dict in centered_cell_lines_dict.items():
    print("working on cell line {}".format(cell_line))
    ret = Tree(cell_line_dict['tree'], 
               cell_line_dict['exp_mat'],
               cell_line_dict['dn'])
    ret.visualize_meaningful_clusters()
    ret.compute_intersection_between_clusters()
    trees_per_cell_line[cell_line] = ret
working on cell line TE14_OESOPHAGUS
Using boundaries of 5 and 180
(2, 55)
significant shape matrix (55, 225)
55
318 390
275 265
275 261
275 278
275 266
275 312
275 314
358 380
437 442
440 443
440 444
414 425
clusters 414 and 425 are weirdly similar
intersection shape matrix (43, 225)
working on cell line BICR16_UPPER_AERODIGESTIVE_TRACT
Using boundaries of 5 and 70
(2, 35)
significant shape matrix (35, 87)
35
125 117
168 170
132 140
132 144
153 156
155 166
169 167
159 162
163 165
intersection shape matrix (26, 87)
working on cell line CAKI2_KIDNEY
Using boundaries of 5 and 111
(2, 52)
significant shape matrix (52, 138)
52
142 143
166 180
217 236
186 210
160 188
213 203
250 258
246 225
intersection shape matrix (44, 138)
working on cell line 8305C_THYROID
Using boundaries of 5 and 148
(2, 63)
significant shape matrix (63, 185)
63
298 344
298 335
269 284
286 306
283 262
321 343
285 275
349 355
349 359
302 295
302 320
342 331
364 365
364 358
346 350
346 328
intersection shape matrix (47, 185)
working on cell line NCIH2444_LUNG
Using boundaries of 5 and 64
(2, 30)
significant shape matrix (30, 79)
30
87 101
100 119
146 143
134 137
139 141
153 152
130 124
intersection shape matrix (23, 79)
working on cell line SCC25_UPPER_AERODIGESTIVE_TRACT
Using boundaries of 5 and 71
(2, 37)
significant shape matrix (37, 88)
37
110 127
145 160
129 139
163 167
168 165
169 170
intersection shape matrix (31, 88)
working on cell line U118MG_CENTRAL_NERVOUS_SYSTEM
Using boundaries of 5 and 82
(2, 37)
significant shape matrix (37, 102)
37
126 127
196 199
180 190
171 164
160 141
176 178
intersection shape matrix (31, 102)
working on cell line ASPC1_PANCREAS
Using boundaries of 5 and 78
(2, 39)
significant shape matrix (39, 97)
39
105 132
170 168
170 185
156 166
154 137
182 176
163 171
intersection shape matrix (32, 97)
working on cell line SKMEL5_SKIN
Using boundaries of 5 and 83
(2, 37)
significant shape matrix (37, 103)
37
108 112
115 126
115 174
146 157
178 173
intersection shape matrix (32, 103)
working on cell line HEC6_ENDOMETRIUM
Using boundaries of 5 and 52
(2, 20)
significant shape matrix (20, 65)
20
121 120
116 111
103 92
intersection shape matrix (17, 65)
working on cell line NCIH2228_LUNG
Using boundaries of 5 and 104
(2, 54)
significant shape matrix (54, 130)
54
229 237
229 242
224 203
249 240
249 252
173 163
245 247
245 250
227 243
199 180
253 255
241 238
197 219
intersection shape matrix (41, 130)
working on cell line HUH28_BILIARY_TRACT
Using boundaries of 5 and 61
(2, 31)
significant shape matrix (31, 76)
31
143 126
96 101
147 146
125 132
139 142
127 123
108 103
135 138
intersection shape matrix (23, 76)
working on cell line ONCODG1_OVARY
Using boundaries of 5 and 117
(2, 49)
significant shape matrix (49, 146)
49
170 188
195 201
250 264
250 228
259 266
259 245
272 279
275 271
275 286
275 281
280 287
273 282
intersection shape matrix (37, 146)
working on cell line KALS1_CENTRAL_NERVOUS_SYSTEM
Using boundaries of 5 and 92
(2, 42)
significant shape matrix (42, 115)
42
181 197
213 207
188 171
200 186
intersection shape matrix (38, 115)
working on cell line OAW28_OVARY
Using boundaries of 5 and 58
(2, 30)
significant shape matrix (30, 72)
30
111 118
107 117
133 135
138 139
clusters 138 and 139 are weirdly similar
115 124
intersection shape matrix (25, 72)
working on cell line OVCAR4_OVARY
Using boundaries of 5 and 120
(2, 49)
significant shape matrix (49, 149)
49
256 251
203 231
287 273
250 189
261 241
291 290
292 294
276 274
intersection shape matrix (41, 149)
working on cell line UACC257_SKIN
Using boundaries of 5 and 80
(2, 40)
significant shape matrix (40, 100)
40
197 196
189 194
189 184
142 183
138 132
163 158
185 187
156 160
intersection shape matrix (32, 100)
working on cell line SNU46_UPPER_AERODIGESTIVE_TRACT
Using boundaries of 5 and 100
(2, 48)
significant shape matrix (48, 125)
48
151 149
165 169
170 190
228 201
219 234
218 233
clusters 218 and 233 are weirdly similar
156 172
241 235
187 177
187 174
187 188
227 220
212 215
212 204
intersection shape matrix (34, 125)

sort clusters by score

  1. Move from working on single cell line to compare between cell lines. Search programs that repeat between cell lines. Join programs from different cell lines, combine them to metaprograms. For each program we defined it for a different set of cells. We need to move to differentially expressed genes. Two options:
  2. For each program take the 50 top genes per program that are most significantly differentially expressed. Make sure that most significant genes enter these 50 top genes. If not replace them with the last genes. Now we want to compare programs by either jaccard index or size of intersection. Add a figure below of from which cell line each program arrived to make sure we do not get clusters by cell lines.

From clusters to programs

We are moving our work now from per-cell to per-program (where a program is defined by a cluster). This requires the following steps:

  1. Convert clusters to programs. In order to do this we are going to look at the 50 top differentially expressed genes within a cluster. These will define the program of a cluster. When looking at said 50 top genes, we need to ensure that all most significant genes that are most differentially expressed in the cluster are in the cluster's program. If not then we will add them and remove the genes with the lowest differential expression that are not "most significantly expressed".
  2. We now have all significant clusters from all cell lines, and their programs defined by sets of 50 genes. We now work on defining a distance metric between different programs. To do this we could use the Jaccard index, however, since the size of the union of two programs is at most 100, and decreases when programs are closely related, we are going to use the size of the intersection as the similarity matrix.
  3. Once we have the different programs and their similarity matrix we will cluster them to get a new order of programs. We will show this in a heatmap and below it we will show a heatmp like (use ticks) showing which cell line each program came from. This to ensure we do not end up with clusters that are only based on original cell line of a program.

Convert clusters to programs

In [91]:
cell_line = 'TE14_OESOPHAGUS'
trees_per_cell_line[cell_line].__dict__.keys()
Out[91]:
dict_keys(['linkage_matrix', 'exp_matrix', 'dn', 'num_original_elements', 'children_dict', 'bottom_threshold', 'top_threshold_percentage', 'top_threshold', 'filtered_clusters', 'most_significant_gene_express', 'differential_expression_per_cluster', 'most_significant_cluster_per_gene', 'significant_clusters', 'cm', 'cells_vs_clusters', 'intersections_between_clusters'])
In [104]:
trees_per_cell_line[cell_line].differential_expression_per_cluster[231]['cluster_id']=231
trees_per_cell_line[cell_line].differential_expression_per_cluster[231]
Out[104]:
gene_number ttest_res t_statistic p_value fold_change is_significant cluster_id
0 8 (0.2372747781506694, 0.8126614654644752) 0.237275 0.812661 0.113775 False 231
1 33 (-1.8586100411709225, 0.06440004642369351) -1.858610 0.064400 0.904683 False 231
2 35 (1.7489707922989446, 0.0816719152981399) 1.748971 0.081672 0.761962 False 231
3 40 (-1.7647504096594875, 0.0789745966283989) -1.764750 0.078975 1.067360 False 231
4 42 (-0.08631112478758983, 0.9312965416358695) -0.086311 0.931297 0.085630 False 231
5 53 (-1.035731627038093, 0.30144919130771664) -1.035732 0.301449 0.360745 False 231
6 54 (-0.24877998775971513, 0.8037599400552214) -0.248780 0.803760 0.124960 False 231
7 57 (0.08330173187842133, 0.9336863506037095) 0.083302 0.933686 0.030207 False 231
8 61 (0.6346305922816224, 0.5263199132814738) 0.634631 0.526320 0.295575 False 231
9 62 (1.98734101278703, 0.048108205985076945) 1.987341 0.048108 0.995982 False 231
10 63 (-0.00343917321506372, 0.9972590167873642) -0.003439 0.997259 0.001630 False 231
11 67 (-1.194370445624271, 0.23360193771214582) -1.194370 0.233602 0.212258 False 231
12 68 (0.7038269402276123, 0.4822748146458975) 0.703827 0.482275 0.318512 False 231
13 69 (-1.142187593215228, 0.25460094260922994) -1.142188 0.254601 0.490960 False 231
14 70 (-0.5363844914640841, 0.592227511468858) -0.536384 0.592228 0.107202 False 231
15 76 (0.30550048045469935, 0.7602699806556827) 0.305500 0.760270 0.146633 False 231
16 78 (-0.7689439371860816, 0.4427402163199672) -0.768944 0.442740 0.384396 False 231
17 79 (1.092295923992701, 0.27588169610210217) 1.092296 0.275882 0.521494 False 231
18 81 (-2.1045519742217245, 0.036450113001747636) -2.104552 0.036450 0.542279 False 231
19 93 (-1.0149970800395627, 0.31120719087164284) -1.014997 0.311207 0.455544 False 231
20 94 (-2.287157036629165, 0.02312464654370643) -2.287157 0.023125 0.752593 False 231
21 105 (-1.5796241435489438, 0.11561033384050853) -1.579624 0.115610 0.530842 False 231
22 113 (0.7343835747822972, 0.46348648647999446) 0.734384 0.463486 0.215722 False 231
23 114 (-0.5610400645549954, 0.5753335338556254) -0.561040 0.575334 0.251288 False 231
24 140 (2.0467567717595756, 0.041854711696001703) 2.046757 0.041855 0.910975 False 231
25 147 (0.36242768761520155, 0.7173756939309437) 0.362428 0.717376 0.167401 False 231
26 149 (0.2786599648134135, 0.7807640193574659) 0.278660 0.780764 0.126682 False 231
27 151 (-0.16433429936170402, 0.8696168364406117) -0.164334 0.869617 0.078671 False 231
28 166 (1.2066949511265095, 0.2288281584803582) 1.206695 0.228828 0.190694 False 231
29 169 (-2.572537997550662, 0.010744570050498073) -2.572538 0.010745 1.042579 False 231
... ... ... ... ... ... ... ...
5297 32538 (-1.0220646924608126, 0.3078577616058711) -1.022065 0.307858 0.487166 False 231
5298 32543 (1.0844004135564385, 0.2793582796626059) 1.084400 0.279358 0.658767 False 231
5299 32576 (-1.1420469273264389, 0.2546592799913542) -1.142047 0.254659 0.485666 False 231
5300 32579 (1.4954198936050176, 0.1362186938121001) 1.495420 0.136219 0.667648 False 231
5301 32581 (2.1607225203990534, 0.03178249501081319) 2.160723 0.031782 0.740326 False 231
5302 32593 (0.7961045356578588, 0.42681793297280823) 0.796105 0.426818 0.399883 False 231
5303 32594 (0.7605319727445838, 0.4477400394616984) 0.760532 0.447740 0.325390 False 231
5304 32595 (0.4098702344622838, 0.6822946025121828) 0.409870 0.682295 0.104564 False 231
5305 32596 (0.4887669832026677, 0.6254870018758532) 0.488767 0.625487 0.232559 False 231
5306 32601 (1.0658070927444034, 0.2876634315045307) 1.065807 0.287663 0.491821 False 231
5307 32611 (1.7377854004642883, 0.08362912045560998) 1.737785 0.083629 0.822157 False 231
5308 32616 (-0.1904072682763386, 0.8491632239671553) -0.190407 0.849163 0.090476 False 231
5309 32641 (-0.03248384141610995, 0.9741152572837011) -0.032484 0.974115 0.015114 False 231
5310 32643 (-1.0471177898560842, 0.2961788623824007) -1.047118 0.296179 0.335866 False 231
5311 32644 (1.3770537615237728, 0.16987636763619032) 1.377054 0.169876 0.646572 False 231
5312 32649 (0.11939477921256138, 0.9050700865470441) 0.119395 0.905070 0.051810 False 231
5313 32690 (1.5757748966715004, 0.11649525450272358) 1.575775 0.116495 0.692975 False 231
5314 32695 (0.06448232543910126, 0.9486439457044011) 0.064482 0.948644 0.027629 False 231
5315 32696 (-2.316373117137012, 0.021445757239940768) -2.316373 0.021446 0.480237 False 231
5316 32697 (-2.5609820510586494, 0.01109779644579459) -2.560982 0.011098 0.514374 False 231
5317 32698 (-2.8208916425353157, 0.005220910209609719) -2.820892 0.005221 0.482427 False 231
5318 32699 (-2.1010200650265296, 0.03676231323078741) -2.101020 0.036762 0.414283 False 231
5319 32701 (-2.6688088169721977, 0.008171492653162823) -2.668809 0.008171 0.566952 False 231
5320 32702 (-2.9683455159554555, 0.003320963808350212) -2.968346 0.003321 0.575978 False 231
5321 32703 (-2.7223749767477283, 0.006993851612293454) -2.722375 0.006994 0.618545 False 231
5322 32704 (0.0607734477492907, 0.9515940673926844) 0.060773 0.951594 0.027194 False 231
5323 32705 (-2.2475833121399846, 0.0255813423831772) -2.247583 0.025581 0.434452 False 231
5324 32706 (-2.143363886175726, 0.033166549677116595) -2.143364 0.033167 0.604827 False 231
5325 32707 (-1.3227229130480838, 0.187282378874555) -1.322723 0.187282 0.669393 False 231
5326 32708 (-2.458505759468182, 0.014713430644409914) -2.458506 0.014713 0.547494 False 231

5327 rows × 7 columns